/*
	This program is free software; you can redistribute it and/or modify
	it under the terms of the GNU General Public License version 2 
	as published by the Free Software Foundation.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program; if not, write to the Free Software
	Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA


	Copyright (C) 2006  Thierry Berger-Perrin <tbptbp@gmail.com>
*/
#ifndef RT_SSE_APPROX_H
#define RT_SSE_APPROX_H

#include "sse.h"
#include "math_constants.h"

/*
	Some Newton-Raphson refined stuff, up to 23bits of precision.
	Finally got a hold onto ap-803 & other sources. Amended.

	But that note was bogus, and really approx should be used for both rcp/rsqrt NR.
	2006-03-12: amended again (and this is the last time i hope).

	rcp'(a) = 2*rcp(a)  a*rcp(a)
	rsqt'(a) = (0.5)*rsqt(a)*(3-a*rsqt(a))
*/

namespace sse {
	/*
		http://www.azillionmonkeys.com/qed/sqroot.html

		aprox = rsqrt(arg)
		Newton-Raphson invsqrt	= (3-(approx*(approx*arg))*approx*0.5
		sqrt	= invsqrt*arg
	*/
	/*
		Using reciprocal instruction for fast division (P3 and P4)
		On P3 and P4, you can use the fast reciprocal instruction RCPSS or RCPPS on the divisor
		and then multiply with the dividend. However, the precision is only 12 bits. You can increase
		the precision to 23 bits by using the Newton-Raphson method described in Intel's
		application note AP-803:
		x0 = RCPSS(d)
		x1 = x0 * (2 - d * x0) = 2 * x0 - d * x0 * x0
		where x0 is the first approximation to the reciprocal of the divisor d, and x1 is a better
		approximation. You must use this formula before multiplying with the dividend.
		; Example 18.2, fast division, single precision (P3, P4)
		MOVAPS XMM1, [DIVISORS] ; load divisors
		RCPPS XMM0, XMM1 ; approximate reciprocal
		MULPS XMM1, XMM0 ; Newton-Raphson formula
		MULPS XMM1, XMM0
		ADDPS XMM0, XMM0
		SUBPS XMM0, XMM1
		MULPS XMM0, [DIVIDENDS] ; results in XMM0
		This makes four divisions in approximately 26 clock cycles (P4) with a precision of 23 bits.
		Increasing the precision further by repeating the Newton-Raphson formula with double
		precision is possible, but not very advantageous.
	*/
	/*	Reciprocal [2 * rcpps(x) - (x * rcpps(x) * rcpps(x))], x > 0 
		= 1 * (2*approx - ((denominator*approx)*approx))
		arg = 0 -> NaN

		despite what ap-803 says, doing arg*approx is more precise than (arg*approx)*approx
	*/
	static FINLINE __m128 rcp_nr_ps(const __m128 denominator) {
		const __m128 
			nr		= rcpps(denominator),					// approx
			//muls	= mulps(mulps(denominator, nr), nr),	// denominator * approx^2 (order is crucial)
			muls	= mulps(mulps(nr, nr), denominator),	// more precisision
			dbl		= addps(nr,nr),							// 2*approx	
			final	= subps(dbl, muls);

		return final;
	}

	// it's only robust wrt to +/- 0, not infinities, eh: rcp(inf) = 0, 0 = 0, 0*inf = NaN.
	static FINLINE __m128 rcp_nr_robust_ps(const __m128 denominator) {
		/*
			First, we'll check if we were given a +/- 0,
			if that's the case, we'll propagate properly signed inf from the rcp
			hopefully, they survive addition so that's cheap: we'll just nullify those muls.
		*/
		const __m128 
			nr			= rcpps(denominator),					// approx
			//muls		= mulps(mulps(denominator, nr), nr),	// denominator * approx^2 (order is crucial)
			muls		= mulps(mulps(nr, nr), denominator),	// more precisision
			dbl			= addps(nr,nr),							// 2*approx	
			mask		= cmpeq(denominator, all_zero()),		// check if arg was +/- 0.f
			filtered	= andnps(mask, muls),					// nuke bogus muls
			final		= subps(dbl, filtered);					// finish.

		return final;
	}

	// = numerator * (2*approx - ((denominator*approx)*approx))
	// ap-803 says, use either:
	// 2 * rcpps(a) - a * rcpps(a) * rcpps(a) or
	// rcpps(a) * [2 - a * rcpps(a)]
	static FINLINE __m128 div_nr_ps(const __m128 numerator, const __m128 denominator) {
		return mulps(numerator, rcp_nr_robust_ps(denominator));
	}



	/*
		18.15 FSQRT (P3 and P4)
		A fast way of calculating an approximate square root on the P3 and P4 is to multiply the
		reciprocal square root of x by x:
		SQRT(x) = x * RSQRT(x)
		The instruction RSQRTSS or RSQRTPS gives the reciprocal square root with a precision of 12
		bits. You can improve the precision to 23 bits by using the Newton-Raphson formula
		described in Intel's application note AP-803:
		x0 = RSQRTSS(a)
		x1 = 0.5 * x0 * (3 - (a * x0)) * x0)
		where x0 is the first approximation to the reciprocal square root of a, and x1 is a better
		approximation. The order of evaluation is important. You must use this formula before
		multiplying with a to get the square root.

		arg = 0 -> NaN

		Once again, it's (slightly this time) more precise to do nr*arg than (nr*arg)*nr.
	*/
	static FINLINE __m128 rcp_sqrt_nr_ps(const __m128 arg) {
		const __m128
			nr		= rsqrtps(arg),
			muls	= mulps(mulps(nr, nr), arg),
			beta	= mulps(cst::section.half.ps, nr),
			gamma	= subps(cst::section.three.ps, muls);

		return mulps(beta, gamma);
	}

	static FINLINE __m128 rcp_sqrt_nr_robust_ps(const __m128 arg) {
		// instead of using a full cond move in the final stage,
		// it's less expensive to mask the 'muls' term, +/- inf will then survive through.
		const __m128
			mask	= cmpeq(arg, all_zero()),
			nr		= rsqrtps(arg),	
			muls	= mulps(mulps(nr, nr), arg),
			filter	= andnps(mask, muls),
			beta	= mulps(cst::section.half.ps, nr),
			gamma	= subps(cst::section.three.ps, filter),
			final	= mulps(beta, gamma);

		return final;
	}

	/* 
		= arg * 1/sqrt(arg)

		Ah shit! Can't re-use rcp_sqrt_nr_robust_ps because that goes wrong at 0.
		Plus precision is really shitty in the end.
	*/
	static FINLINE __m128 sqrt_nr_ps(const __m128 arg) {
		// return mulps(arg, rcp_sqrt_nr_robust_ps(arg)); 
		const __m128
			mask	= cmpeq(arg, all_zero()),
			nr		= rsqrtps(arg),	
			muls	= mulps(mulps(nr, nr), arg),
			beta	= mulps(cst::section.half.ps, nr),
			gamma	= subps(cst::section.three.ps, muls),
			invsqrt	= mulps(beta, gamma),
			filter	= andnps(mask, invsqrt),
			final	= mulps(arg, filter);

		return final;
	}


	/*
		Scalar versions.

	*/
	static FINLINE float rcp_sqrt_nr_ss(const float &f) {
		const __m128
			arg		= _mm_load_ss(&f),
			nr		= rsqrtss(arg),
			muls	= mulss(mulss(arg, nr), nr),
			beta	= mulss(loadss(&cst::section.half.s), nr),
			gamma	= subss(loadss(&cst::section.three.s), muls),
			final	= mulss(beta, gamma);

		float result;
		_mm_store_ss(&result, final);
		return result;
	}

	#if 0
		static FINLINE float rcp_nr_ss_out(const float &f) {
			const __m128
				denominator = _mm_load_ss(&f),
				nr		= rcpss(denominator),					// approx
				muls	= mulss(mulss(nr, nr), denominator),	// nein! bogus ap-803
				dbl		= addss(nr,nr),							// 2*approx	
				final	= subss(dbl, muls);

			float result;
			_mm_store_ss(&result, final);
			return result;
		}
	#endif
	#define rcp_nr_ss rcp_nr_robust_ss

	// it's not easy to NaN-fix for scalars.
	// let's try with *ps logics (would be a pain with a comiss)
	static FINLINE float rcp_nr_robust_ss(const float f) {
		const __m128
			denominator = _mm_set_ss(f),						// set, not load
			nr			= rcpss(denominator),					// approx
			muls		= mulss(mulss(nr, nr), denominator),	// nein! bogus ap-803
			dbl			= addss(nr,nr),							// 2*approx	
			mask		= cmpeq(denominator, all_zero()),		// check if arg was +/- 0.f
			filtered	= andnps(mask, muls),					// nuke bogus muls
			final		= subss(dbl, filtered);					// finish.

		float result;
		_mm_store_ss(&result, final);
		return result;
	}

	static FINLINE float div_nr_ss(const float numerator, const float denominator) {
		return numerator* rcp_nr_robust_ss(denominator);
	}

}


#endif
